{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "20dbecf0-cd27-4bed-a2cd-7c9e31005806",
   "metadata": {},
   "source": [
    "# 预训练"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "29086589-ef59-4edf-a6e8-69cd6ee2010c",
   "metadata": {},
   "source": [
    "镜像获取：\n",
    "\n",
    "```shell\n",
    "docker pull swr.cn-southwest-2.myhuaweicloud.com/atelier/pytorch_2_3_ascend:pytorch_2.3.1-cann_8.0.rc3-py_3.10-hce_2.0.2409-aarch64-snt9b-20241213131522-aafe527\n",
    "\n",
    "docker run -it --privileged --name=llm -u root --net=host --ipc=host \\\n",
    "--device=/dev/davinci0 \\\n",
    "--device=/dev/davinci1 \\\n",
    "--device=/dev/davinci2 \\\n",
    "--device=/dev/davinci3 \\\n",
    "--device=/dev/davinci4 \\\n",
    "--device=/dev/davinci5 \\\n",
    "--device=/dev/davinci6 \\\n",
    "--device=/dev/davinci7 \\\n",
    "--device=/dev/davinci_manager \\\n",
    "--device=/dev/devmm_svm \\\n",
    "--device=/dev/hisi_hdc \\\n",
    "-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \\\n",
    "-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \\\n",
    "-v /usr/local/sbin/:/usr/local/sbin/ \\\n",
    "-v /var/log/npu/slog/:/var/log/npu/slog \\\n",
    "-v /var/log/npu/profiling/:/var/log/npu/profiling \\\n",
    "-v /var/log/npu/dump/:/var/log/npu/dump \\\n",
    "-v /var/log/npu/:/usr/slog \\\n",
    "-v /etc/hccn.conf:/etc/hccn.conf \\\n",
    "-v /home/icbc:/mnt \\\n",
    "-w /mnt \\\n",
    "419f2a9943a4 \\\n",
    "/bin/bash\n",
    "\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8f06a27-801e-43be-8980-1579f519ec49",
   "metadata": {},
   "source": [
    "### 1 数据预览"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0903320-f0ea-4552-ad73-da2655322c45",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "! head -2 wikipedia-zh-cn-512.jsonl"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "140d4633-c664-4d71-8e61-51eb180b5b33",
   "metadata": {},
   "source": [
    "### 2 工具安装"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0b0e1ed-a1a7-4b98-927e-64d69d274098",
   "metadata": {},
   "source": [
    "```shell\n",
    "git clone https://gitee.com/janelu9/EasyLLM\n",
    "cd EasyLLM\n",
    "pip wheel -e . --no-deps && pip install jllm-*-py3-none-any.whl\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6e475aa-6f85-4db5-bf06-9a411ca553f4",
   "metadata": {},
   "source": [
    "### 3 数据转换"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3165bcd-0af4-4ae9-bbbc-3880a3b87ad4",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m jllm.raw2ids \\\n",
    "    --tokenizer Qwen2.5-7B-Instruct \\\n",
    "    -i wikipedia-zh-cn-512.jsonl \\\n",
    "    --max_len 8193 \\\n",
    "    -t pretrain --stack -C \n",
    "# --stack 拼接token凑成max_len的长度，减少pad_id; \n",
    "# -C 清除缓存重新转换 "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d434280-b77e-4fd6-ba6d-0353b0a38b02",
   "metadata": {},
   "source": [
    "### 3.1 数据检查(可选步骤)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "80ddfc47-1201-4a3a-9163-27d4a01cba36",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "import pyarrow.parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a737ecb1-568b-443f-9721-2373c9e38827",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer=AutoTokenizer.from_pretrained('Qwen2.5-7B-Instruct')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1d2dbade-d4d6-4984-b9e3-05a68acf2ee6",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pyarrow.parquet.read_table('wikipedia-zh-cn-512_Qwen2.5-7B-Instruct/wikipedia-zh-cn-512-00000')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9fb4a43c-9561-4383-8034-592267abc476",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_ids =data['input_ids'].to_numpy().tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9b9e7bd3-496f-433f-a874-b3b892749e49",
   "metadata": {},
   "outputs": [],
   "source": [
    "idx=0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57d42ded-1143-453c-a49a-2779bf0b7a93",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(tokenizer.decode(input_ids[idx]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "37a29a19-4fb7-44a9-ae68-47eff22fa1f2",
   "metadata": {},
   "source": [
    "### 4 模型训练"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d692b1c-f84a-48ad-9f68-b9695dcfa93f",
   "metadata": {},
   "source": [
    "#### 注: NPU下使用张量并行(model_parallel_size>1)需要Megatron和MindSpeed。"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8178837a-f241-452f-9a05-7d5b9a124cd3",
   "metadata": {},
   "source": [
    "```shell\n",
    "# 获取megatron\n",
    "git clone https://github.com/NVIDIA/Megatron-LM.git\n",
    "cd Megatron-LM\n",
    "git checkout core_r0.8.0\n",
    "cp -r megatron ../\n",
    "# 安装mindspeed\n",
    "git clone -b 2.0.0_core_r0.8.0 https://gitee.com/ascend/MindSpeed.git\n",
    "pip install -e MindSpeed --user\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f26c72d-502e-4797-ba5f-bdb847b673de",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!deepspeed --module jllm.train_pipe \\\n",
    "    --model Qwen2.5-7B-Instruct \\\n",
    "    --num_train_epochs 1 \\\n",
    "    --train_data wikipedia-zh-cn-512_Qwen2.5-7B-Instruct \\\n",
    "    --pipe_parallel_size 2 \\\n",
    "    --model_parallel_size 2 \\\n",
    "    --sequence_parallel_size 1 \\\n",
    "    --per_device_train_batch_size 1 \\\n",
    "    --global_batch_size 16 \\\n",
    "    --partition_method fast \\\n",
    "    --output_dir pretrained \\\n",
    "    --max_num_checkpoints 2 \\\n",
    "    --split_dlayer \\\n",
    "    --learning_rate 1e-5 |tee pretrain.log\n",
    "#注释：\n",
    "# --model 模型路径至少需要包含config.json\n",
    "# --num_train_epochs 训练轮数\n",
    "# --train_data 训练数据\n",
    "# --pipe_parallel_size 流水线并行个数\n",
    "# --model_parallel_size 张量并行个数\n",
    "# --per_device_train_batch_size 一次输入训练多少样本\n",
    "# --global_batch_size 训练完多少样本后（累加完多少个梯度后）进行一次参数更新\n",
    "# --partition_method fast 流水线拆分策略\n",
    "# --checkpoint checkpoint 模型检查点目录\n",
    "# --output_dir pretrained 最终模型输出目录\n",
    "# --max_num_checkpoints 2 最大保留多少个检查点\n",
    "# --split_dlayer 是否拆分decoder layer\n",
    "# --learning_rate 1e-5 学习率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51a7f04f-55e3-439d-a945-1b0ccda87d8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "!grep  'steps:.*loss:' pretrain.log|awk '{print $2,$4}'>pretrain.loss\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "# ndmin=2 keeps xy two-dimensional even when the log holds a single matching line,\n",
    "# so the column slicing below (and in the next cell) does not raise IndexError.\n",
    "xy=np.loadtxt('pretrain.loss', ndmin=2)\n",
    "plt.plot(xy[:,0], xy[:,1])\n",
    "# Label the axes so the figure can be read on its own.\n",
    "plt.xlabel('steps')\n",
    "plt.ylabel('loss')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4bca33a0-d00d-47eb-af28-bac44d9327b4",
   "metadata": {},
   "source": [
    "##### 正常情况下形如："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b911c41-6d82-4702-9656-74f3d5f762fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(xy[:,0], 1/xy[:,0])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ac91532-f8f0-4bb5-9467-11e3bd00a575",
   "metadata": {},
   "source": [
    "### 5 参数转换"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f01fbc2-a6bf-4d5a-9835-5b73550015f7",
   "metadata": {},
   "source": [
    "#### 5.1 checkpoint转huggingface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84f687c0-1659-4f5a-b5d3-762c59821ba8",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!deepspeed --module jllm.train_pipe \\\n",
    "    --model Qwen2.5-7B-Instruct \\\n",
    "    --train_data wikipedia-zh-cn-512_Qwen2.5-7B-Instruct \\\n",
    "    --pipe_parallel_size 2 \\\n",
    "    --model_parallel_size 2 \\\n",
    "    --partition_method fast \\\n",
    "    --split_dlayer \\\n",
    "    --num_train_epochs 0 \\\n",
    "    --from_ckpt checkpoint \\\n",
    "    --output_dir pretrained\n",
    "#--model 模型路径\n",
    "#--train_data 训练数据\n",
    "#--pipe_parallel_size 流水线长度\n",
    "#--partition_method 流水线拆分方法\n",
    "#--split_dlayer 将decoder layer拆开,使流水线分布更均匀\n",
    "#--from_ckpt 加载模型参数的checkpoint路径\n",
    "#--output_dir  输出huggingface格式模型的路径"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7956374-28f3-4248-a8b8-0cdbfb21a0e4",
   "metadata": {},
   "source": [
    "#### 5.2 合并拆分张量(model_parallel_size>=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfeb9775-1548-461f-816e-42708bb2a5ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m jllm.cat2hf \\\n",
    "    -C pretrained \n",
    "# -C 合并前的模型路径\n",
    "# -H 合并后的huggingface格式的模型路径。不填则自动创建为pretrained_hf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "912a9764-6243-44c7-a1b6-cb1a34cfc0f1",
   "metadata": {},
   "source": [
    "### 6 模型测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "6181c5c9-757f-4259-a551-68901c209216",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "tokenizer=AutoTokenizer.from_pretrained('Qwen2.5-7B-Instruct')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3182983b-1459-4db8-aefd-b4b017f2553c",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    'pretrained_hf',\n",
    "    torch_dtype=\"auto\",\n",
    "    device_map=\"auto\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "ac4396f4-8a4a-4b3e-b99c-3751f6093b77",
   "metadata": {},
   "outputs": [],
   "source": [
    "text='数学是研究数量、结构以及空间等概念及其变化的一门学科，属于'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "a6f28f48-626a-4fdf-8039-ae32c8455620",
   "metadata": {},
   "outputs": [],
   "source": [
    "from time import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36fea6b9-b426-437c-85fc-52e5b4682aa3",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n",
    "st=time()\n",
    "generated_ids = model.generate(\n",
    "    **model_inputs,\n",
    "    max_new_tokens=128\n",
    ")\n",
    "du=time()-st\n",
    "tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "7d693b60-901e-4c44-952b-e8b3163bc313",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7.7173362469042015"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tokens per second, counting the tokens actually generated: generate() may stop at EOS\n",
    "# before max_new_tokens, and the returned sequence still contains the prompt tokens.\n",
    "(generated_ids.shape[1] - model_inputs['input_ids'].shape[1]) / du"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
